The captions in the graph can represent the flow of data exploration. # Import and name variables and factors properly
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ dplyr 1.0.7
## ✔ tibble 3.1.8 ✔ stringr 1.4.0
## ✔ tidyr 1.1.4 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
SeedMethod <- read_csv("/Users/panareeboonyuen/Thai-rice-open-data-Github/data/raw-data/MethodSeedDensityAnalysis.csv")
## Rows: 894 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): province ENG, method
## dbl (9): year, ID_01, planting area(rai), harvesting area (rai), production ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(SeedMethod)[11]<- "SeedUtilise"
names(SeedMethod)[9]<- "YieldHA"
names(SeedMethod)[8]<- "YieldPA"
names(SeedMethod)[3]<- "Province"
#Change "PlacingMachine" to a better term "SowingMachine"
SeedMethod$method[SeedMethod$method == "PlacingMachine"] <- "DirectSow"
SeedMethod %>%
ggplot() +
geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = method))+
labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield (Kg/rai)",title = "Whole-country level", caption = "Any pattern of yield increase due to the increase in the seed use rate?\nFor Transplant and direct sow, the more seed use, the higher the yield. Homogenous trends.\nDry and Wet broadcast produced U-shaped trend. It can be due to year/region/province-specific effect.")
#What if we are dividing the data by year
SeedMethod %>%
ggplot() +
geom_point(mapping = aes(x=SeedUtilise,y=YieldPA, col = method))+
labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield per Planting Area (Kg/rai)", caption = "Each year sees the same trend, so it is not a year-specific effect")+
facet_wrap(~(year))
#This means it is not a year-specific effect
region_provincelist <- read_csv("data/raw-data/region-provincellsit.csv",col_names = F)
## Rows: 6 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): X1, X2
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NorthEast<- region_provincelist[1,2]
North <- region_provincelist[2,2]
South <- region_provincelist[3,2]
Central <- region_provincelist[4,2]
East <- region_provincelist[5,2]
West <- region_provincelist[6,2]
library(stringr)
FormProvinceStr <- function(Listcomma) {
Listcomma <- str_replace_all(Listcomma, " ", "")
Listcomma2 <- str_split(Listcomma, ",", simplify = TRUE)
return(Listcomma2)
}
#use with each list
North <- FormProvinceStr(North)
NorthEast <- FormProvinceStr(NorthEast)
South <- FormProvinceStr(South)
Central <- FormProvinceStr(Central)
East <- FormProvinceStr(East)
West <- FormProvinceStr(West)
SeedMethod$Province<-str_replace_all(SeedMethod$Province, " ", "")
provinceList <- c(NorthEast, North, South, Central, East, West)
provinceList <-
provinceList[order(provinceList)] #order alphabetically
#length(provinceList) == 77 checked
SeedMethod<-SeedMethod %>%
mutate(
region = case_when(
match(Province,NorthEast)>0 ~ "NorthEast",
match(Province,North)>0 ~ "North",
match(Province,South)>0 ~ "South",
match(Province,Central)>0 ~ "Central",
match(Province,East)>0 ~ "East",
match(Province,West)>0 ~ "West",
TRUE ~ "CheckSpell"
)
)
SeedMethod %>%
ggplot() +
geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = method), size = 1)+
labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield per Planting Area (Kg/rai)", caption = "#North have two types of responses of yields to seed use increase, for Dry and Wet Broadcast,\nwhile other regions had consistent trends (positive correlation)\n#NE was not competitive at all for Dry Broadcast and Wet Broadcast\n(but this may be because of low-yielding KDML105 was planted the most in NE)\n#No Sowing machine method for Central and East seem fishy
#No other management and cultivar information! - the problem of confounding factor
")+
facet_wrap(~(region))
SeedMethod %>% filter(region == "North") %>% filter(method == "DryBroadcast") %>%
ggplot() +
geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = Province))+
labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield (Kg/rai)", title = "Northern province level - DryBroadcast")+
scale_y_continuous(limits = c(300,750))
SeedMethod %>% filter(region == "North") %>% filter(method == "WetBroadcast") %>%
ggplot() +
geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = Province))+
labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield (Kg/rai)", title = "Northern province level - WetBroadcast")+
scale_y_continuous(limits = c(300,750))
cat("Two schools of thought for how much seed they should use
Use low seed amount = ChaingMai, ChaingRai, Phrae, LamPang, LamPhun, Phayao
Use high seed amount = Uttaradit, kamphaengPhet, Phetchabun, Phichit, Phitsanulok, Sukhothai\n")
## Two schools of thought for how much seed they should use
## Use low seed amount = ChaingMai, ChaingRai, Phrae, LamPang, LamPhun, Phayao
## Use high seed amount = Uttaradit, kamphaengPhet, Phetchabun, Phichit, Phitsanulok, Sukhothai
cat("Questions:
1. What forces high-seed provinces to use a higher seed amount, which is a waste of money
2. What if they use lower seed?
3. What cause high variation in low-seed use provinces?
4. What make CM having so high yield")
## Questions:
## 1. What forces high-seed provinces to use a higher seed amount, which is a waste of money
## 2. What if they use lower seed?
## 3. What cause high variation in low-seed use provinces?
## 4. What make CM having so high yield
SeedMethod %>%
ggplot() +
geom_point(mapping = aes(x=SeedUtilise,y=YieldPA, col = method))+
labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield per Planting Area (Kg/rai)")+
facet_wrap(~Province)
#we can see that there are too few data points to analyse each of the provinces
ggplot(data = SeedMethod) +
geom_bar(mapping=aes(x=Province, fill = method), orientation = "x")+
theme(axis.text.x =element_text(angle = 90, size = 5))
#filter only high count province
ObsCount<-SeedMethod %>% group_by(Province) %>% summarise(n = n())
HighObsCount <- ObsCount %>% filter(n>=15)
#There are 17 provinces with equal or more than 15 obeservations
SeedMethod %>% filter(match(Province,HighObsCount$Province)>0) %>%
ggplot() +
geom_point(mapping = aes(x=SeedUtilise,y=YieldHA, col = method))+
labs(x=" Seed Utilisation rate (Kg/rai)", y = "Yield per Planting Area (Kg/rai)", caption = "The distribution of data points are so similar;\neach of the four corners are represented by the same methods\nBut we cannot just recommend the whole country to do transplanting\nas we do not have other information e.g. cultivar and irrigation status")+
facet_wrap(~Province)
From DSSAT, simulation shows that Direct sowing (DS) give lower, more spread yields, while transplanting (TP) give higher, less spread yield. We will check that with the real data from OAE
ggplot(SeedMethod) +
geom_histogram(mapping = aes(YieldHA, fill = region), position = "stack")+
facet_wrap(~method)+
labs(caption = "From the whole-country look,
Mean yield from DS fields is indeed smaller than that of TP DS has a narrower spread than TP.
However, them DS data come from only 4 regions (no East and Central),\nwhile TP data come from 6, so we will plot them by region. ", x = "Yield (kg/rai)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Region considered
ggplot(SeedMethod %>% filter(method %in% c("DirectSow", "Transplant"))) +
geom_histogram(mapping = aes(YieldHA, fill = region), position = "identity")+
facet_grid(method~region)+
theme(axis.text.x = element_text(angle = 90, size = 5))+
labs(x = "Yield (kg/rai)", caption = "Comparing the histograms of DS and TP of the same region,
TP created a wider spread of yields.
Mean yield shows the sample expected trends
")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#get stat
SeedMethod %>% group_by(method, region) %>% summarise(n = n(), meanY = round(mean(YieldHA),digits = 2), sdY = round(sd(YieldHA),digits = 2), CVY = round(mean(YieldHA)/sd(YieldHA), digits = 2)) %>% filter(method %in% c("DirectSow", "Transplant"))
## `summarise()` has grouped output by 'method'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 6
## # Groups: method [2]
## method region n meanY sdY CVY
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 DirectSow North 39 404. 67.2 6.01
## 2 DirectSow NorthEast 34 344. 29.4 11.7
## 3 DirectSow South 24 324. 32.7 9.92
## 4 DirectSow West 8 392. 84.6 4.63
## 5 Transplant Central 44 676. 50.1 13.5
## 6 Transplant East 32 486. 107. 4.55
## 7 Transplant North 56 583. 49.9 11.7
## 8 Transplant NorthEast 80 369. 24.6 15.0
## 9 Transplant South 54 393. 59.3 6.63
## 10 Transplant West 20 609. 105. 5.8
# Seed utilisation considered
ggplot(SeedMethod %>% filter(method %in% c("DirectSow", "Transplant")) )+
geom_point(aes(x = SeedUtilise, y = YieldHA, col = method))+
labs(x = "Seed Utilisation Rate (kg/rai)", y = "Yield (kg/rai)")+
labs(caption = "In addition to region, another available factor that affect yields is seed utilisation rate.
Its range for TP is actually greater than that for DS at country-level,
possibly creating a wider spread or yield. ")
#produce group , suing 8 and 14 as division
lowseed <- SeedMethod %>% group_by(method,region) %>% filter(SeedUtilise <= 8,method %in% c("DirectSow", "Transplant"))
lowseed$group <- rep("low", nrow(lowseed))
highseed <- SeedMethod %>% group_by(method,region) %>% filter(SeedUtilise > 8, SeedUtilise <= 14,method %in% c("DirectSow", "Transplant"))
highseed$group <- rep("high", nrow(highseed))
seedu2met <- rbind(lowseed, highseed)
ggplot(seedu2met)+
geom_histogram(aes(x=YieldHA, fill = method), position = "identity", alpha = 0.5)+
facet_grid(region~group)+
labs(caption = "Mean yield show the consistent expected trends:
higher for TP for all region and all seed utilisation group
<still confounding = management and cultivar and weather>")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
seedu2met %>% group_by(group, region, method) %>% summarise(n = n(), meanY = mean(YieldHA), sdY = sd(YieldHA), CVY = mean(YieldHA)/sd(YieldHA)) %>% filter(method %in% c("DirectSow", "Transplant"), region %in% c("North", "NorthEast", "South", "West")) %>% ggplot()+
geom_col(aes(x= group , y = CVY, fill = method), position = "dodge")+
facet_wrap(~region, ncol = 4)+
labs(caption = "For spread, CV of TP (expected to be lower) is actually lower than that of DS
in 3 out of 6 cases: in south for both seed use rates,
and in west when the seed use rate in between 8-14 kr/rai (high)
")
## `summarise()` has grouped output by 'group', 'region'. You can override using
## the `.groups` argument.
#Regression modelling * We will try using the data from Transplant (TP) and DirectSow (DS) fields to find a regression model of yield. Yield = response variable, and 1) method, 2) seed utilisation, and 3) region will be candidate explanatory variables. * Note that there are MAJOR confounding factors (variables that can affect yield that have not been controlled/known) such as varieties of rice, management (irrigation, fertilisation etc). * However, we will attempt forming the model, because at least, the trend of yields are monotonically increasing (more so for TP) with respect to seed utilisation rate for TP and DS fields. We will only use data from 4 regions (“North”, “NorthEast”, “South”, “West”) because the other two regions have no records of DS fields
#correlation between seed utilisation rate and yield
mono_df <-
SeedMethod %>% filter(method %in% c("Transplant", "DirectSow"), region %in% c("North", "NorthEast", "South", "West"))
mono_df %>%
ggplot() +
geom_point(mapping = aes(x = SeedUtilise, y = YieldHA, col = method))
mod <- lm(YieldHA ~ method * SeedUtilise * region, data = mono_df)
coef(mod)
## (Intercept)
## 639.04245
## methodTransplant
## -41.88269
## SeedUtilise
## -23.05364
## regionNorthEast
## -335.89031
## regionSouth
## -346.49323
## regionWest
## 131.16428
## methodTransplant:SeedUtilise
## 21.79267
## methodTransplant:regionNorthEast
## 30.89769
## methodTransplant:regionSouth
## 83.74198
## methodTransplant:regionWest
## -482.46376
## SeedUtilise:regionNorthEast
## 27.67144
## SeedUtilise:regionSouth
## 27.88834
## SeedUtilise:regionWest
## -17.08233
## methodTransplant:SeedUtilise:regionNorthEast
## -16.52435
## methodTransplant:SeedUtilise:regionSouth
## -19.49284
## methodTransplant:SeedUtilise:regionWest
## 49.76579
par(mfrow = c(2, 2))
plot(mod) #residuals do not seem to spread randomly
#drop1(mod, test = "F")
m1 <- lm(YieldHA ~ method + SeedUtilise, data = mono_df)
coef(m1)
## (Intercept) methodTransplant SeedUtilise
## 132.56982 80.13114 26.28754
par(mfrow = c(2, 2))
plot(m1)
m2 <- lm(YieldHA ~ method + SeedUtilise + region, data = mono_df)
coef(m2)
## (Intercept) methodTransplant SeedUtilise regionNorthEast
## 357.887466 97.203935 8.608792 -134.144122
## regionSouth regionWest
## -119.691190 25.344632
par(mfrow = c(2, 2))
plot(m2)
mono_df$fittedYieldfull <- fitted(mod)
mono_df$fittedYieldMS <- fitted(m1)
mono_df$fittedYieldMSR <- fitted(m2)
colmodel <-
c("M*S*R" = "red",
"M+S+R" = "darkgreen",
"M+S" = "blue")
ggplot(mono_df) +
#geom_point(aes(x = SeedUtilise, y = YieldHA, col = method), shape = 2)+
geom_point(aes(x = SeedUtilise, y = YieldHA, col = method), shape = 2) +
geom_point(aes(x = SeedUtilise, y = fittedYieldfull, col = "M*S*R"), size = 1) +
geom_point(aes(x = SeedUtilise, y = fittedYieldMS, col = "M+S"), size = 1) +
geom_point(aes(x = SeedUtilise, y = fittedYieldMSR, col = "M+S+R"), size = 1) +
labs(x = "Seed Utilisation Rate (kg/rai)", y = "Yield (kg/rai)") +
scale_color_manual(values = colmodel) +
labs(title = "Fitted values of Yield from 3 models", color = "Model factors") +
#facet_wrap(~region)
facet_wrap( ~ method)
#facet_grid(method~region)
#compare models
#1. using anova
mod <- lm(YieldHA ~ method * SeedUtilise * region, data = mono_df)
m1 <- lm(YieldHA ~ method + SeedUtilise, data = mono_df)
m2 <- lm(YieldHA ~ method + SeedUtilise + region, data = mono_df)
anova(mod, m1, m2)
## Analysis of Variance Table
##
## Model 1: YieldHA ~ method * SeedUtilise * region
## Model 2: YieldHA ~ method + SeedUtilise
## Model 3: YieldHA ~ method + SeedUtilise + region
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 299 650915
## 2 312 1977556 -13 -1326641 46.877 < 2.2e-16 ***
## 3 309 1151450 3 826106 126.492 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F is high -> reject null that smaller model is better, so this means mod is better and the interactions between factors are significant (can help explain the variation in yield)
#2. also try using AIC - stepwise regression: start with no predictors, then sequentially add the most contributive predictors (like forward selection). After adding each new variable, remove any variables that no longer provide an improvement in the model fit (like backward selection).
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
step.model <- stepAIC(mod, direction = "both",
trace = FALSE)
summary(step.model)
##
## Call:
## lm(formula = YieldHA ~ method * SeedUtilise * region, data = mono_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -142.005 -25.834 -1.738 21.740 143.289
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 639.042 55.790 11.454
## methodTransplant -41.883 80.552 -0.520
## SeedUtilise -23.054 5.416 -4.256
## regionNorthEast -335.890 70.034 -4.796
## regionSouth -346.493 73.224 -4.732
## regionWest 131.164 100.847 1.301
## methodTransplant:SeedUtilise 21.793 7.409 2.941
## methodTransplant:regionNorthEast 30.898 97.788 0.316
## methodTransplant:regionSouth 83.742 95.687 0.875
## methodTransplant:regionWest -482.464 130.398 -3.700
## SeedUtilise:regionNorthEast 27.671 7.200 3.843
## SeedUtilise:regionSouth 27.888 8.904 3.132
## SeedUtilise:regionWest -17.082 10.281 -1.662
## methodTransplant:SeedUtilise:regionNorthEast -16.524 9.915 -1.667
## methodTransplant:SeedUtilise:regionSouth -19.493 10.508 -1.855
## methodTransplant:SeedUtilise:regionWest 49.766 12.504 3.980
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## methodTransplant 0.603488
## SeedUtilise 2.78e-05 ***
## regionNorthEast 2.56e-06 ***
## regionSouth 3.44e-06 ***
## regionWest 0.194386
## methodTransplant:SeedUtilise 0.003522 **
## methodTransplant:regionNorthEast 0.752249
## methodTransplant:regionSouth 0.382186
## methodTransplant:regionWest 0.000257 ***
## SeedUtilise:regionNorthEast 0.000148 ***
## SeedUtilise:regionSouth 0.001907 **
## SeedUtilise:regionWest 0.097656 .
## methodTransplant:SeedUtilise:regionNorthEast 0.096629 .
## methodTransplant:SeedUtilise:regionSouth 0.064562 .
## methodTransplant:SeedUtilise:regionWest 8.65e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.66 on 299 degrees of freedom
## Multiple R-squared: 0.8254, Adjusted R-squared: 0.8166
## F-statistic: 94.23 on 15 and 299 DF, p-value: < 2.2e-16
as.data.frame(coef(step.model))
## coef(step.model)
## (Intercept) 639.04245
## methodTransplant -41.88269
## SeedUtilise -23.05364
## regionNorthEast -335.89031
## regionSouth -346.49323
## regionWest 131.16428
## methodTransplant:SeedUtilise 21.79267
## methodTransplant:regionNorthEast 30.89769
## methodTransplant:regionSouth 83.74198
## methodTransplant:regionWest -482.46376
## SeedUtilise:regionNorthEast 27.67144
## SeedUtilise:regionSouth 27.88834
## SeedUtilise:regionWest -17.08233
## methodTransplant:SeedUtilise:regionNorthEast -16.52435
## methodTransplant:SeedUtilise:regionSouth -19.49284
## methodTransplant:SeedUtilise:regionWest 49.76579
#3. using caret, with Cross validation
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
train.control <- trainControl(method = "cv", number = 10)
step.model.ca <-
train(
YieldHA ~ method * SeedUtilise * region,
data = mono_df,
method = "lmStepAIC",
trControl = train.control,
trace = FALSE
)
# Final model coefficients
step.model.ca$finalModel
##
## Call:
## lm(formula = .outcome ~ SeedUtilise + regionNorthEast + regionSouth +
## regionWest + `methodTransplant:SeedUtilise` + `methodTransplant:regionWest` +
## `SeedUtilise:regionNorthEast` + `SeedUtilise:regionSouth` +
## `SeedUtilise:regionWest` + `methodTransplant:SeedUtilise:regionNorthEast` +
## `methodTransplant:SeedUtilise:regionSouth` + `methodTransplant:SeedUtilise:regionWest`,
## data = dat)
##
## Coefficients:
## (Intercept)
## 618.952
## SeedUtilise
## -21.121
## regionNorthEast
## -322.205
## regionSouth
## -291.104
## regionWest
## 151.255
## `methodTransplant:SeedUtilise`
## 17.975
## `methodTransplant:regionWest`
## -524.346
## `SeedUtilise:regionNorthEast`
## 26.443
## `SeedUtilise:regionSouth`
## 20.803
## `SeedUtilise:regionWest`
## -19.015
## `methodTransplant:SeedUtilise:regionNorthEast`
## -13.990
## `methodTransplant:SeedUtilise:regionSouth`
## -9.802
## `methodTransplant:SeedUtilise:regionWest`
## 53.584
# Summary of the model
summary(step.model.ca$finalModel)
##
## Call:
## lm(formula = .outcome ~ SeedUtilise + regionNorthEast + regionSouth +
## regionWest + `methodTransplant:SeedUtilise` + `methodTransplant:regionWest` +
## `SeedUtilise:regionNorthEast` + `SeedUtilise:regionSouth` +
## `SeedUtilise:regionWest` + `methodTransplant:SeedUtilise:regionNorthEast` +
## `methodTransplant:SeedUtilise:regionSouth` + `methodTransplant:SeedUtilise:regionWest`,
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.441 -25.575 -1.577 22.348 143.289
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 618.9521 40.1072 15.432
## SeedUtilise -21.1208 3.9258 -5.380
## regionNorthEast -322.2050 48.4849 -6.645
## regionSouth -291.1042 44.2578 -6.577
## regionWest 151.2547 92.8364 1.629
## `methodTransplant:SeedUtilise` 17.9748 0.9831 18.284
## `methodTransplant:regionWest` -524.3465 102.1962 -5.131
## `SeedUtilise:regionNorthEast` 26.4434 5.0197 5.268
## `SeedUtilise:regionSouth` 20.8027 4.9874 4.171
## `SeedUtilise:regionWest` -19.0152 9.5532 -1.990
## `methodTransplant:SeedUtilise:regionNorthEast` -13.9899 1.5503 -9.024
## `methodTransplant:SeedUtilise:regionSouth` -9.8022 1.9901 -4.926
## `methodTransplant:SeedUtilise:regionWest` 53.5837 10.0863 5.312
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## SeedUtilise 1.50e-07 ***
## regionNorthEast 1.41e-10 ***
## regionSouth 2.11e-10 ***
## regionWest 0.1043
## `methodTransplant:SeedUtilise` < 2e-16 ***
## `methodTransplant:regionWest` 5.17e-07 ***
## `SeedUtilise:regionNorthEast` 2.63e-07 ***
## `SeedUtilise:regionSouth` 3.97e-05 ***
## `SeedUtilise:regionWest` 0.0474 *
## `methodTransplant:SeedUtilise:regionNorthEast` < 2e-16 ***
## `methodTransplant:SeedUtilise:regionSouth` 1.39e-06 ***
## `methodTransplant:SeedUtilise:regionWest` 2.10e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.5 on 302 degrees of freedom
## Multiple R-squared: 0.8248, Adjusted R-squared: 0.8179
## F-statistic: 118.5 on 12 and 302 DF, p-value: < 2.2e-16
#compare the result of coefficient from
as.data.frame(coef(mod))
## coef(mod)
## (Intercept) 639.04245
## methodTransplant -41.88269
## SeedUtilise -23.05364
## regionNorthEast -335.89031
## regionSouth -346.49323
## regionWest 131.16428
## methodTransplant:SeedUtilise 21.79267
## methodTransplant:regionNorthEast 30.89769
## methodTransplant:regionSouth 83.74198
## methodTransplant:regionWest -482.46376
## SeedUtilise:regionNorthEast 27.67144
## SeedUtilise:regionSouth 27.88834
## SeedUtilise:regionWest -17.08233
## methodTransplant:SeedUtilise:regionNorthEast -16.52435
## methodTransplant:SeedUtilise:regionSouth -19.49284
## methodTransplant:SeedUtilise:regionWest 49.76579
as.data.frame(coef(step.model)) #same as above as no recalculation of the coef
## coef(step.model)
## (Intercept) 639.04245
## methodTransplant -41.88269
## SeedUtilise -23.05364
## regionNorthEast -335.89031
## regionSouth -346.49323
## regionWest 131.16428
## methodTransplant:SeedUtilise 21.79267
## methodTransplant:regionNorthEast 30.89769
## methodTransplant:regionSouth 83.74198
## methodTransplant:regionWest -482.46376
## SeedUtilise:regionNorthEast 27.67144
## SeedUtilise:regionSouth 27.88834
## SeedUtilise:regionWest -17.08233
## methodTransplant:SeedUtilise:regionNorthEast -16.52435
## methodTransplant:SeedUtilise:regionSouth -19.49284
## methodTransplant:SeedUtilise:regionWest 49.76579
as.data.frame(step.model.ca$finalModel[["coefficients"]])
## step.model.ca$finalModel[["coefficients"]]
## (Intercept) 618.952053
## SeedUtilise -21.120778
## regionNorthEast -322.205049
## regionSouth -291.104237
## regionWest 151.254676
## `methodTransplant:SeedUtilise` 17.974759
## `methodTransplant:regionWest` -524.346453
## `SeedUtilise:regionNorthEast` 26.443400
## `SeedUtilise:regionSouth` 20.802718
## `SeedUtilise:regionWest` -19.015196
## `methodTransplant:SeedUtilise:regionNorthEast` -13.989920
## `methodTransplant:SeedUtilise:regionSouth` -9.802151
## `methodTransplant:SeedUtilise:regionWest` 53.583704
# Model accuracy
step.model.ca$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 47.35933 0.8136577 35.78955 5.027525 0.03911718 2.949323
#using content from http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/
testdataf <- data.frame(region = rep(c( "North" , "NorthEast", "South" , "West"), 3),SeedUtilise = c(rep(4,4),rep(8,4),rep(12,4)))
testdataf <- rbind(testdataf,testdataf)
testdataf$method <- c(rep("DirectSow",12) , rep("Transplant",12))
predict(mod, newdata = testdataf)
## 1 2 3 4 5 6 7 8
## 546.8279 321.6233 311.8880 609.6628 454.6133 340.0945 331.2268 449.1189
## 9 10 11 12 13 14 15 16
## 362.3987 358.5657 350.5656 288.5750 592.1159 331.7116 362.9466 371.5502
## 17 18 19 20 21 22 23 24
## 587.0720 371.2561 391.4847 497.2402 582.0281 410.8006 420.0229 622.9301
testdataf$fitted <- predict(mod, newdata = testdataf)
ggplot(testdataf)+
geom_col(aes(y=fitted,x = SeedUtilise, fill = method), position = "dodge")+
facet_wrap(~region, nrow = 1)+
scale_x_continuous(breaks = c(4,8,12))+
labs(x= "Seed use rate (kg/rai)", y = "fitted yield from the best model (kg/rai)", caption = "In NE, the effects on yields of TP and DS are the most similar.
The increasing yield with seed use rates are also seen with both planting types in the South ans in NE.
However, in N, it seems that the higher seed use rate caused the lower yield.
Lastly, in the West, only for TP that the increasing trends with seed use rate is seen.
Therefore, we will look further into these 2 regions to understand these unusal results.")
#Plot of North
ggplot(SeedMethod %>% filter(region == "North", method %in% c("Transplant","DirectSow")))+
geom_point(aes(y=YieldHA,x = SeedUtilise, col = Province))+
facet_wrap(~method)+
labs(x = "Seed use rate (kg/rai)", y = "Recorded Yield (kg/rai)", title = "North", caption = "The raw data do not contain seed use rate = 4 kg/rai for DS and TP, and no 8 kg/rai for TP,
it would have been an extrapolation at regional level when we examplified the previous graph.
For DS, only Lamphun and ChiangMai truly showed downward yield trends with increasing seed use.
For TP, it is hard to notice the trends.
However, one trend is that data points from the same provinces cluster quite close together")
#Plot of West
ggplot(SeedMethod %>% filter(region == "West", method %in% c("Transplant","DirectSow")))+
geom_point(aes(y=YieldHA,x = SeedUtilise, col = Province))+
facet_wrap(~method)+
labs(x = "Seed use rate (kg/rai)", y = "Recorded Yield (kg/rai)", title = "West",caption = "For DS, data were only from two provinces,
but, again, too few data points per province to further the investigation
and data points from the same provinces cluster quite close together ")
Summary: The best model is when using all three factors including the interaction terms, according to stepwise regression using AIC with 10-fold cross validation (Yield ~ method * SeedUtilise * region). The example new input data set shows in From the performance metric table, on average the model is off by 36-47 kg/rai. The unrealistic downward trends of yield with increasing seed use in N (for both planting methods) and W (for direct sowing) can be due to 1) extrapolation, 2) province-specific effect and we cannot investigate further because 1) too few data points for each province, 2) no other management variables given for each province.